A set of SMS messages, each labeled as ham or spam.
In [1]:
import pandas as pd
import string
import re
from sklearn import model_selection
In [2]:
# Path to the raw SMS Spam Collection corpus (tab-separated, no header row).
DATASET_FILE = 'data/sms-spam/SMSSpamCollection'
# Each row is "<class>\t<message>"; name the columns explicitly.
dataset = pd.read_csv(DATASET_FILE, sep='\t', names=['class','sms'])
dataset.head()
Out[2]:
In [3]:
# Report the overall class balance of the dataset.
print("Dataset Size: {}".format(len(dataset)))
value_counts = dataset['class'].value_counts()
print(value_counts)
# Index by label rather than position: value_counts[0] on a string-indexed
# Series is deprecated positional fallback and depended on sort order.
print("ham %: {}".format(round(value_counts['ham']/len(dataset)*100, 2)))
# Bug fix: this line previously printed the spam ratio under a "ham %" label.
print("spam %: {}".format(round(value_counts['spam']/len(dataset)*100, 2)))
In [4]:
# Substrings stripped from every message before further processing.
exclude = ['\t', '"']


def clean_text(text, exclude_chars=None):
    """Normalize one SMS message.

    Removes each substring in ``exclude_chars`` (defaults to the
    module-level ``exclude`` list), lowercases the result, and trims
    surrounding whitespace.

    Args:
        text: raw SMS message string.
        exclude_chars: optional iterable of substrings to delete;
            falls back to ``exclude`` when None (backward compatible).

    Returns:
        The cleaned, lowercased, whitespace-trimmed string.
    """
    if exclude_chars is None:
        exclude_chars = exclude
    for c in exclude_chars:
        text = text.replace(c, '')
    return text.lower().strip()
# Run every message through the cleaner and write the normalized text
# back into the frame's 'sms' column.
sms_processed = [clean_text(text) for text in dataset['sms'].values]
dataset['sms'] = sms_processed
# Stratified 75/25 split: preserves the ham/spam ratio in both subsets.
# The fixed random_state makes the split reproducible across runs.
splitter = model_selection.StratifiedShuffleSplit(n_splits=1,
                                                  test_size=0.25,
                                                  random_state=19850610)
splits = list(splitter.split(X=dataset['sms'], y=dataset['class']))
train_index = splits[0][0]
valid_index = splits[0][1]
# split() yields positional indices, so index with .iloc; the original
# .loc call only worked because the index was the default RangeIndex.
train_df = dataset.iloc[train_index]
print(len(train_df))
valid_df = dataset.iloc[valid_index]
print(len(valid_df))
In [5]:
# Verify that the stratified split preserved the class balance.
print("Training Set")
training_value_counts = train_df['class'].value_counts()
print(training_value_counts)
print("ham %: {}".format(round(training_value_counts['ham']/len(train_df)*100, 2)))
# Bug fix: previously printed the spam ratio under a "ham %" label.
print("spam %: {}".format(round(training_value_counts['spam']/len(train_df)*100, 2)))
print("")
print("Validation Set")
validation_value_counts = valid_df['class'].value_counts()
print(validation_value_counts)
print("ham %: {}".format(round(validation_value_counts['ham']/len(valid_df)*100, 2)))
# Bug fix: previously printed the spam ratio under a "ham %" label.
print("spam %: {}".format(round(validation_value_counts['spam']/len(valid_df)*100, 2)))
In [6]:
# Persist both splits as headerless TSV for the downstream training job.
train_df.to_csv("data/sms-spam/train-data.tsv", header=False, index=False, sep='\t')
valid_df.to_csv("data/sms-spam/valid-data.tsv", header=False, index=False, sep='\t')
In [7]:
# Sanity check: reload the written training split exactly as a consumer would.
pd.read_csv("data/sms-spam/train-data.tsv", sep='\t', names=['class','sms']).tail()
Out[7]:
In [12]:
# Sanity check: reload the written validation split exactly as a consumer would.
pd.read_csv("data/sms-spam/valid-data.tsv", sep='\t', names=['class','sms']).tail()
Out[12]:
In [9]:
def get_vocab(df=None):
    """Build the vocabulary of unique space-delimited tokens.

    Args:
        df: optional DataFrame with an 'sms' column; defaults to the
            module-level ``train_df`` (backward compatible).

    Returns:
        List of unique tokens, with the empty token removed.
        Order is unspecified (derived from a set).
    """
    if df is None:
        df = train_df
    vocab = set()
    for text in df['sms'].values:
        vocab.update(text.split(' '))
    # split(' ') yields '' for consecutive spaces; discard() is a no-op
    # when '' is absent, whereas remove() would raise KeyError.
    vocab.discard('')
    return list(vocab)
In [10]:
# Build the vocabulary from the training split and inspect a small slice.
vocab = get_vocab()
print(len(vocab))
vocab[10:20]
Out[10]:
In [11]:
# Token used for sequence padding; written first so it takes line 0.
PAD_WORD = '#=KS=#'

# Persist the vocabulary, one token per line, padding token at the top.
with open('data/sms-spam/vocab_list.tsv', 'w') as file:
    for word in [PAD_WORD] + vocab:
        file.write("{}\n".format(word))

# Record the raw vocabulary size.
# NOTE(review): this excludes PAD_WORD, so vocab_list.tsv has one more
# line than this count — confirm downstream expects len(vocab).
with open('data/sms-spam/n_words.tsv', 'w') as file:
    file.write(str(len(vocab)))
In [ ]: